package org.elasticsearch.common.lucene.search.vectorhighlight;

import gnu.trove.set.hash.TCharHashSet;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;

/**
 * A copy of Lucene {@link org.apache.lucene.search.vectorhighlight.SimpleBoundaryScanner}.
 * <p/>
 * It differs from the original in two ways: boundary characters are looked up in a specialized
 * char set ({@link TCharHashSet}), and {@link #findStartOffset(StringBuilder, int)} returns
 * {@code 0} when the backward scan runs into the beginning of the text. The latter addresses
 * https://issues.apache.org/jira/browse/LUCENE-3697 (which has a problem with multiple empty
 * fields to highlight...).
 */
public class SimpleBoundaryScanner2 implements BoundaryScanner {

    /** Scan distance used by the no-argument constructor. */
    public static final int DEFAULT_MAX_SCAN = 20;

    /** Boundary characters used by the no-argument constructor. */
    public static final char[] DEFAULT_BOUNDARY_CHARS = {'.', ',', '!', '?', ' ', '\t', '\n'};

    // Must stay below the two constants above: static initializers run in textual order and
    // the no-argument constructor reads DEFAULT_BOUNDARY_CHARS.
    /** Shared scanner built from the default settings. */
    public static final SimpleBoundaryScanner2 DEFAULT = new SimpleBoundaryScanner2();

    /** Upper bound on the number of characters inspected by a single scan. */
    public int maxScan;

    /** Characters that are treated as fragment boundaries. */
    public TCharHashSet boundaryChars;

    /**
     * Creates a scanner with {@link #DEFAULT_MAX_SCAN} and {@link #DEFAULT_BOUNDARY_CHARS}.
     */
    public SimpleBoundaryScanner2() {
        this(DEFAULT_MAX_SCAN, DEFAULT_BOUNDARY_CHARS);
    }

    /**
     * Creates a scanner with explicit settings.
     *
     * @param maxScan       maximum number of characters a scan may inspect
     * @param boundaryChars characters to treat as boundaries; they are loaded into a new
     *                      {@link TCharHashSet}
     */
    public SimpleBoundaryScanner2(int maxScan, char[] boundaryChars) {
        this.maxScan = maxScan;
        this.boundaryChars = new TCharHashSet(boundaryChars);
    }

    /**
     * Scans backward from {@code start} looking for a boundary character.
     *
     * @param buffer text to scan
     * @param start  offset at which the backward scan begins
     * @return the offset immediately after the closest preceding boundary character found
     *         within {@code maxScan} characters; {@code 0} if the scan reached the beginning
     *         of the text without finding one; otherwise {@code start}. {@code start} is also
     *         returned unchanged when it is less than 1 or greater than the buffer length.
     */
    public int findStartOffset(StringBuilder buffer, int start) {
        boolean legalStart = start <= buffer.length() && start >= 1;
        if (!legalStart) {
            return start;
        }

        int pos = start;
        for (int scanned = 0; scanned < maxScan && pos > 0; scanned++, pos--) {
            // The character just before pos is the candidate boundary.
            if (boundaryChars.contains(buffer.charAt(pos - 1))) {
                return pos;
            }
        }

        // LUCENE-3697: hitting the beginning of the text counts as a boundary;
        // otherwise nothing was found and the caller's offset is kept.
        return pos == 0 ? 0 : start;
    }

    /**
     * Scans forward from {@code start} looking for a boundary character.
     *
     * @param buffer text to scan
     * @param start  offset at which the forward scan begins
     * @return the offset of the first boundary character found at or after {@code start}
     *         within {@code maxScan} characters; otherwise {@code start}. {@code start} is
     *         also returned unchanged when it is negative or greater than the buffer length.
     */
    public int findEndOffset(StringBuilder buffer, int start) {
        final int length = buffer.length();
        boolean legalStart = start <= length && start >= 0;
        if (!legalStart) {
            return start;
        }

        int pos = start;
        for (int scanned = 0; scanned < maxScan && pos < length; scanned++, pos++) {
            if (boundaryChars.contains(buffer.charAt(pos))) {
                return pos;
            }
        }

        // No boundary within reach (or end of text reached): keep the caller's offset.
        return start;
    }
}
